Список задач:

0. Подготовка данных

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(psych)
## 
## Присоединяю пакет: 'psych'
## 
## Следующие объекты скрыты от 'package:ggplot2':
## 
##     %+%, alpha
library(ggpubr)
library(patchwork)
library(kableExtra)
## 
## Присоединяю пакет: 'kableExtra'
## 
## Следующий объект скрыт от 'package:dplyr':
## 
##     group_rows
library(skimr)
library(readxl)
col_I_sn<-read_excel("stat2021_sm\\us_col.std\\I_shortname.xls")
## New names:
## • `` -> `...1`
col_I<-read_excel("stat2021_sm\\us_col.std\\I.xls")
## New names:
## • `` -> `...1`
head(col_I_sn)|>rmarkdown::paged_table()
summary(col_I_sn)
##      ...1               PPIND            FICE          STATE          
##  Length:176         Min.   :1.000   Min.   : 1009   Length:176        
##  Class :character   1st Qu.:1.000   1st Qu.: 1738   Class :character  
##  Mode  :character   Median :1.000   Median : 2550   Mode  :character  
##                     Mean   :1.301   Mean   : 2901                     
##                     3rd Qu.:2.000   3rd Qu.: 3417                     
##                     Max.   :2.000   Max.   :10366                     
##                                                                       
##      TYPE              AVRMATH         AVRVERB         AVRCOMB    
##  Length:176         Min.   :390.0   Min.   :391.0   Min.   : 810  
##  Class :character   1st Qu.:520.0   1st Qu.:454.5   1st Qu.: 980  
##  Mode  :character   Median :544.0   Median :479.5   Median :1017  
##                     Mean   :563.5   Mean   :494.7   Mean   :1058  
##                     3rd Qu.:603.0   3rd Qu.:518.5   3rd Qu.:1126  
##                     Max.   :750.0   Max.   :665.0   Max.   :1410  
##                     NA's   :68      NA's   :68      NA's   :67    
##     AVR_ACT          MATH_1          MATH_3          VERB_1     
##  Min.   :19.00   Min.   :350.0   Min.   :480.0   Min.   :320.0  
##  1st Qu.:22.00   1st Qu.:460.0   1st Qu.:590.0   1st Qu.:400.0  
##  Median :23.00   Median :500.0   Median :627.5   Median :430.0  
##  Mean   :23.69   Mean   :515.4   Mean   :635.0   Mean   :448.3  
##  3rd Qu.:25.00   3rd Qu.:570.0   3rd Qu.:687.5   3rd Qu.:480.0  
##  Max.   :31.00   Max.   :740.0   Max.   :780.0   Max.   :630.0  
##  NA's   :83      NA's   :34      NA's   :34      NA's   :34     
##      VERB_3          ACT_1           ACT_3          APP_REC     
##  Min.   :440.0   Min.   :16.00   Min.   :21.00   Min.   :  787  
##  1st Qu.:520.0   1st Qu.:19.00   1st Qu.:25.00   1st Qu.: 4323  
##  Median :550.0   Median :21.00   Median :26.00   Median : 7654  
##  Mean   :561.3   Mean   :21.28   Mean   :26.55   Mean   : 8544  
##  3rd Qu.:600.0   3rd Qu.:23.00   3rd Qu.:28.00   3rd Qu.:11776  
##  Max.   :720.0   Max.   :29.00   Max.   :33.00   Max.   :48094  
##  NA's   :34      NA's   :67      NA's   :67      NA's   :1      
##     APP_ACC         NEW_STUD        NEW10           NEW25          FULLTIME    
##  Min.   :  507   Min.   : 210   Min.   : 8.00   Min.   :24.00   Min.   :  912  
##  1st Qu.: 3033   1st Qu.:1264   1st Qu.:24.00   1st Qu.:52.00   1st Qu.: 5846  
##  Median : 4761   Median :1949   Median :32.00   Median :63.00   Median :10215  
##  Mean   : 5546   Mean   :2252   Mean   :41.48   Mean   :65.55   Mean   :11296  
##  3rd Qu.: 7232   3rd Qu.:3035   3rd Qu.:57.00   3rd Qu.:82.00   3rd Qu.:15033  
##  Max.   :26330   Max.   :7425   Max.   :98.00   Max.   :99.00   Max.   :31643  
##  NA's   :1                      NA's   :16      NA's   :26                     
##     PARTTIME          IN_STATE        OUT_STAT        R_B_COST   
##  Min.   :   16.0   Min.   :  647   Min.   : 2279   Min.   :2082  
##  1st Qu.:  804.8   1st Qu.: 2100   1st Qu.: 6712   1st Qu.:3588  
##  Median : 1694.5   Median : 3030   Median : 8400   Median :4213  
##  Mean   : 2519.9   Mean   : 6641   Mean   :10108   Mean   :4538  
##  3rd Qu.: 3240.8   3rd Qu.:12348   3rd Qu.:12668   3rd Qu.:5564  
##  Max.   :21836.0   Max.   :20100   Max.   :20100   Max.   :7425  
##  NA's   :6         NA's   :7       NA's   :1                     
##       ROOM          BOARD         ADD_FEE            BOOK           PERSONAL   
##  Min.   :1033   Min.   :1000   Min.   :  20.0   Min.   : 300.0   Min.   : 300  
##  1st Qu.:1810   1st Qu.:1762   1st Qu.: 210.0   1st Qu.: 500.0   1st Qu.:1164  
##  Median :2644   Median :2125   Median : 425.5   Median : 600.0   Median :1600  
##  Mean   :2708   Mean   :2241   Mean   : 648.1   Mean   : 603.1   Mean   :1763  
##  3rd Qu.:3490   3rd Qu.:2586   3rd Qu.: 694.0   3rd Qu.: 673.8   3rd Qu.:2200  
##  Max.   :6965   Max.   :4760   Max.   :4374.0   Max.   :1230.0   Max.   :6800  
##  NA's   :42     NA's   :64     NA's   :40       NA's   :2        NA's   :11    
##       PH_D           TERM_D         SF_RATIO         DONATE     
##  Min.   :63.00   Min.   :67.00   Min.   : 2.90   Min.   : 4.00  
##  1st Qu.:80.50   1st Qu.:87.00   1st Qu.:10.88   1st Qu.:10.75  
##  Median :87.00   Median :92.00   Median :14.50   Median :17.00  
##  Mean   :85.72   Mean   :90.52   Mean   :14.23   Mean   :19.01  
##  3rd Qu.:92.00   3rd Qu.:96.00   3rd Qu.:18.02   3rd Qu.:24.00  
##  Max.   :99.00   Max.   :99.00   Max.   :24.70   Max.   :54.00  
##  NA's   :9       NA's   :16                      NA's   :12     
##     INSTRUCT        GRADUAT         SAL_FULL          SAL_AC     
##  Min.   : 3605   Min.   :10.00   Min.   : 446.0   Min.   :364.0  
##  1st Qu.: 7604   1st Qu.:47.50   1st Qu.: 597.0   1st Qu.:445.0  
##  Median : 9840   Median :62.00   Median : 661.0   Median :479.5  
##  Mean   :12832   Mean   :62.02   Mean   : 669.2   Mean   :487.1  
##  3rd Qu.:14340   3rd Qu.:74.50   3rd Qu.: 732.2   3rd Qu.:521.2  
##  Max.   :62469   Max.   :99.00   Max.   :1009.0   Max.   :733.0  
##                  NA's   :5                                       
##      SAL_AS         SAL_ALL         COMP_FUL         COMP_AC     
##  Min.   :323.0   Min.   :362.0   Min.   : 537.0   Min.   :438.0  
##  1st Qu.:381.8   1st Qu.:472.2   1st Qu.: 729.0   1st Qu.:556.5  
##  Median :407.0   Median :522.5   Median : 815.0   Median :606.0  
##  Mean   :412.8   Mean   :534.0   Mean   : 827.2   Mean   :612.1  
##  3rd Qu.:434.2   3rd Qu.:578.2   3rd Qu.: 910.0   3rd Qu.:662.2  
##  Max.   :576.0   Max.   :866.0   Max.   :1236.0   Max.   :909.0  
##                                                                  
##     COMP_AS         COMP_ALL         NUM_FULL         NUM_AC     
##  Min.   :395.0   Min.   : 436.0   Min.   : 39.0   Min.   : 32.0  
##  1st Qu.:481.0   1st Qu.: 587.0   1st Qu.:184.5   1st Qu.:138.5  
##  Median :508.5   Median : 652.0   Median :278.0   Median :208.0  
##  Mean   :519.0   Mean   : 665.8   Mean   :336.8   Mean   :230.3  
##  3rd Qu.:552.2   3rd Qu.: 730.2   3rd Qu.:457.8   3rd Qu.:299.0  
##  Max.   :717.0   Max.   :1075.0   Max.   :997.0   Max.   :721.0  
##                                                   NA's   :1      
##      NUM_AS         NUM_INS          NUM_ALL      
##  Min.   : 29.0   Min.   :  0.00   Min.   : 108.0  
##  1st Qu.:128.5   1st Qu.:  5.00   1st Qu.: 505.5  
##  Median :175.0   Median : 16.00   Median : 721.0  
##  Mean   :190.7   Mean   : 27.09   Mean   : 812.8  
##  3rd Qu.:238.2   3rd Qu.: 40.00   3rd Qu.:1035.0  
##  Max.   :510.0   Max.   :178.00   Max.   :2261.0  
##                  NA's   :1
colSums(is.na(col_I_sn))
##     ...1    PPIND     FICE    STATE     TYPE  AVRMATH  AVRVERB  AVRCOMB 
##        0        0        0        0        0       68       68       67 
##  AVR_ACT   MATH_1   MATH_3   VERB_1   VERB_3    ACT_1    ACT_3  APP_REC 
##       83       34       34       34       34       67       67        1 
##  APP_ACC NEW_STUD    NEW10    NEW25 FULLTIME PARTTIME IN_STATE OUT_STAT 
##        1        0       16       26        0        6        7        1 
## R_B_COST     ROOM    BOARD  ADD_FEE     BOOK PERSONAL     PH_D   TERM_D 
##        0       42       64       40        2       11        9       16 
## SF_RATIO   DONATE INSTRUCT  GRADUAT SAL_FULL   SAL_AC   SAL_AS  SAL_ALL 
##        0       12        0        5        0        0        0        0 
## COMP_FUL  COMP_AC  COMP_AS COMP_ALL NUM_FULL   NUM_AC   NUM_AS  NUM_INS 
##        0        0        0        0        0        1        0        1 
##  NUM_ALL 
##        0
col_I_sn$PPIND=ifelse(col_I_sn$PPIND==1, "public", "private")
col_I_sn_noNA<-col_I_sn|>na.omit()
paste("col_I_sn dim: ", paste(dim(col_I_sn),collapse=", "))
## [1] "col_I_sn dim:  176, 49"
paste("col_I_sn_noNA dim: ", paste(dim(col_I_sn_noNA), collapse=", "))
## [1] "col_I_sn_noNA dim:  22, 49"

Огромная потеря в данных для case-wise. Следует либо применить pair-wise, либо заполнить данные.

Логарифмируем датасет

q_cols<-sapply(col_I_sn, is.numeric)
q_cols[c("PPIND", "FICE")]<-FALSE
q_cols<-names(q_cols)[q_cols==TRUE]
q_cols
##  [1] "AVRMATH"  "AVRVERB"  "AVRCOMB"  "AVR_ACT"  "MATH_1"   "MATH_3"  
##  [7] "VERB_1"   "VERB_3"   "ACT_1"    "ACT_3"    "APP_REC"  "APP_ACC" 
## [13] "NEW_STUD" "NEW10"    "NEW25"    "FULLTIME" "PARTTIME" "IN_STATE"
## [19] "OUT_STAT" "R_B_COST" "ROOM"     "BOARD"    "ADD_FEE"  "BOOK"    
## [25] "PERSONAL" "PH_D"     "TERM_D"   "SF_RATIO" "DONATE"   "INSTRUCT"
## [31] "GRADUAT"  "SAL_FULL" "SAL_AC"   "SAL_AS"   "SAL_ALL"  "COMP_FUL"
## [37] "COMP_AC"  "COMP_AS"  "COMP_ALL" "NUM_FULL" "NUM_AC"   "NUM_AS"  
## [43] "NUM_INS"  "NUM_ALL"
col_I_sn_log<-col_I_sn|>
  mutate(across(q_cols, ~log(.x)))|>
  rename_with(~paste0("log-", .x), q_cols)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(q_cols, ~log(.x))`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(q_cols)
## 
##   # Now:
##   data %>% select(all_of(q_cols))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
head(col_I_sn_log)|>rmarkdown::paged_table()

Так как в столбце NUM_INS были 0, они станут -inf и их стоит не учитывать

col_I_sn_log$`log-NUM_INS`[col_I_sn_log$`log-NUM_INS`==-Inf]=NA

Для обсуждения данных нужно сделать следующее (написано много, так как очень подробно):

  1. Разобраться в том, что означают признаки.

Количественные признаки:

  • AVRMATH Average Math SAT score
  • AVRVERB Average Verbal SAT score
  • AVRCOMB Average Combined SAT score
  • AVR_ACT Average ACT score
  • MATH_1 First quartile - Math SAT
  • MATH_3 Third quartile - Math SAT
  • VERB_1 First quartile - Verbal SAT
  • VERB_3 Third quartile - Verbal SAT
  • ACT_1 First quartile - ACT
  • ACT_3 Third quartile - ACT
  • APP_REC Number of applications received
  • APP_ACC Number of applicants accepted
  • NEW_STUD Number of new students enrolled
  • FULLTIME Number of fulltime undergraduates
  • PARTTIME Number of parttime undergraduates
  • IN_STATE In-state tuition
  • OUT_STAT Out-of-state tuition
  • R_B_COST Room and board costs
  • ROOM Room costs
  • BOARD Board costs
  • ADD_FEE Additional fees
  • BOOK Estimated book costs
  • PERSONAL Estimated personal spending
  • PH_D Pct. of faculty with Ph.D.’s
  • TERM_D Pct. of faculty with terminal degree
  • SAL_FULL Average salary - full professor
  • SAL_AC Average salary - associate professor
  • SAL_AS Average salary - assistant professor
  • SAL_ALL Average salary - all ranks
  • COMP_FUL Average compensation - full professor
  • COMP_AC Average compensation - associate professor
  • COMP_AS Average compensation - assistant professor
  • COMP_ALL Average compensation - all ranks
  • NUM_FULL Number of full professor
  • NUM_AC Number of associate professor
  • NUM_AS Number of assistant professor
  • NUM_INS Number of instructors
  • NUM_ALL Number of faculty - all ranks
  • INSTRUCT Instructional expenditure per student
  • GRADUAT Graduation rate
  • SF_RATIO Student/faculty ratio
  • DONATE Pct.alumni who donate
  • NEW10 Pct. new students from top 10% of H.S. class - % студентов из топ 10% своей старшей школы
  • NEW25 Pct. new students from top 25% of H.S. class - % студентов из топ 25% своей старшей школы

Порядковые признаки:

  • FICE - Federal ID Number

Качественные признаки:

  • …1 - Название университета
  • PPIND Public/private indicator (public=1, private=2)
  • STATE State (postal code)
  • TYPE - I (можно удалить)
  1. Если признаков очень много, то отобрать признаки (примерно 7-10) из следующих соображений: там должны быть признаки, упоминаемые в задании, и признаки, которые оказывают влияние на зависимую переменную в пункте 4 задания. Также, можно сократить число признаков, выбрав по представителю из каждой группы сильно коррелированных и похожих по смыслу признаков. При выборе представителей можно обращать внимание на число пропусков, на интерпретируемость и пр.

Рассматриваемые признаки (из .tsk):

  1. PPIND
  2. ADD_FEE
  3. BOOK
  4. NEW10 (зависимая переменная, также NEW25)

К ним можно добавить следующие признаки:

  1. AVRMATH, MATH_1, MATH_3, AVRVERB, VERB_1, VERB_3, AVR_ACT, ACT_1, ACT_3 и AVRCOMB (могут влиять на NEW10 по смыслу)
col_NEW10_sn<-col_I_sn[c("...1", "PPIND", "AVRMATH", "MATH_1", "MATH_3", "AVRVERB", "VERB_1", "VERB_3", "AVR_ACT", "ACT_1", "ACT_3", "AVRCOMB")]
  1. Определить вид признаков (колич., порядковые, качеств.). Для количеств. признаков определить, непрерывные они или дискретные (в том числе, дискретными могут стать непрерывные признаки, которые измерены с плохой точностью). Как вариант, это можно увидеть, посмотрев на частоту моды.
create_plots_list <- function(vec) {
    list(
        unique = unique(vec),
        hist = hist(vec, plot=FALSE),
        ecdf = ecdf(vec),
        ggqqplot = ggqqplot(vec),
        shapiro_test=shapiro.test(vec)
    )
}

col_I_sn_q_pub<-col_I_sn|>
  filter(PPIND=="public")|>
  select(q_cols)|>
  map(create_plots_list)|>
  set_names(q_cols)

col_I_sn_q_prv<-col_I_sn|>
  filter(PPIND=="private")|>
  select(q_cols)|>
  map(create_plots_list)|>
  set_names(q_cols)

col_I_sn_q_all<-col_I_sn|>
  select(q_cols)|>
  map(create_plots_list)|>
  set_names(q_cols)
imap(col_I_sn_q_all, ~plot(.x$ecdf, main = .y, sub = paste("p-value = ",round(.x$shapiro_test$p.value,5)))+plot(.x$ggqqplot, main=.y))

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 68 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 83 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 67 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 26 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 6 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 7 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 42 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 64 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 40 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 2 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 11 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 9 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 12 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 5 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## $AVRMATH
## NULL
## 
## $AVRVERB
## NULL
## 
## $AVRCOMB
## NULL
## 
## $AVR_ACT
## NULL
## 
## $MATH_1
## NULL
## 
## $MATH_3
## NULL
## 
## $VERB_1
## NULL
## 
## $VERB_3
## NULL
## 
## $ACT_1
## NULL
## 
## $ACT_3
## NULL
## 
## $APP_REC
## NULL
## 
## $APP_ACC
## NULL
## 
## $NEW_STUD
## NULL
## 
## $NEW10
## NULL
## 
## $NEW25
## NULL
## 
## $FULLTIME
## NULL
## 
## $PARTTIME
## NULL
## 
## $IN_STATE
## NULL
## 
## $OUT_STAT
## NULL
## 
## $R_B_COST
## NULL
## 
## $ROOM
## NULL
## 
## $BOARD
## NULL
## 
## $ADD_FEE
## NULL
## 
## $BOOK
## NULL
## 
## $PERSONAL
## NULL
## 
## $PH_D
## NULL
## 
## $TERM_D
## NULL
## 
## $SF_RATIO
## NULL
## 
## $DONATE
## NULL
## 
## $INSTRUCT
## NULL
## 
## $GRADUAT
## NULL
## 
## $SAL_FULL
## NULL
## 
## $SAL_AC
## NULL
## 
## $SAL_AS
## NULL
## 
## $SAL_ALL
## NULL
## 
## $COMP_FUL
## NULL
## 
## $COMP_AC
## NULL
## 
## $COMP_AS
## NULL
## 
## $COMP_ALL
## NULL
## 
## $NUM_FULL
## NULL
## 
## $NUM_AC
## NULL
## 
## $NUM_AS
## NULL
## 
## $NUM_INS
## NULL
## 
## $NUM_ALL
## NULL

Визуально по ecdf (Empirical CDF) можно разделить количественные признаки на непрерывные и дискретные следующим образом:

Дискретные:

  • AVR_ACT
  • MATH_1
  • MATH_3
  • VERB_1
  • VERB_3
  • ACT_1
  • ACT_3
  • PH_D ?
  • TERM_D
  • DONATE ? (В процентах, предположительно непрерывные, но визуально похожи на дискретные)

Непрерывные:

  • Остальные
q_cols_log<-paste0("log-",q_cols)
col_I_sn_q_all_log<-col_I_sn_log|>
  select(q_cols_log)|>
  map(create_plots_list)|>
  set_names(q_cols_log)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(q_cols_log)
## 
##   # Now:
##   data %>% select(all_of(q_cols_log))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
imap(col_I_sn_q_all_log, ~plot(.x$ecdf, main=.y, sub=paste0("p-value = ", round(.x$shapiro_test$p.value, 5)))+plot(.x$ggqqplot, main=.y))

## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 68 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 68 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 83 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 34 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 34 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 67 rows containing non-finite outside the scale range (`stat_qq()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 67 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 26 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 6 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 7 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 42 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 64 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 40 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 2 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 11 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 9 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 16 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 12 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 5 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).
## Removed 1 row containing non-finite outside the scale range (`stat_qq_line()`).

## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_qq()`).
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_qq_line()`).
## Removed 27 rows containing non-finite outside the scale range
## (`stat_qq_line()`).

## $`log-AVRMATH`
## NULL
## 
## $`log-AVRVERB`
## NULL
## 
## $`log-AVRCOMB`
## NULL
## 
## $`log-AVR_ACT`
## NULL
## 
## $`log-MATH_1`
## NULL
## 
## $`log-MATH_3`
## NULL
## 
## $`log-VERB_1`
## NULL
## 
## $`log-VERB_3`
## NULL
## 
## $`log-ACT_1`
## NULL
## 
## $`log-ACT_3`
## NULL
## 
## $`log-APP_REC`
## NULL
## 
## $`log-APP_ACC`
## NULL
## 
## $`log-NEW_STUD`
## NULL
## 
## $`log-NEW10`
## NULL
## 
## $`log-NEW25`
## NULL
## 
## $`log-FULLTIME`
## NULL
## 
## $`log-PARTTIME`
## NULL
## 
## $`log-IN_STATE`
## NULL
## 
## $`log-OUT_STAT`
## NULL
## 
## $`log-R_B_COST`
## NULL
## 
## $`log-ROOM`
## NULL
## 
## $`log-BOARD`
## NULL
## 
## $`log-ADD_FEE`
## NULL
## 
## $`log-BOOK`
## NULL
## 
## $`log-PERSONAL`
## NULL
## 
## $`log-PH_D`
## NULL
## 
## $`log-TERM_D`
## NULL
## 
## $`log-SF_RATIO`
## NULL
## 
## $`log-DONATE`
## NULL
## 
## $`log-INSTRUCT`
## NULL
## 
## $`log-GRADUAT`
## NULL
## 
## $`log-SAL_FULL`
## NULL
## 
## $`log-SAL_AC`
## NULL
## 
## $`log-SAL_AS`
## NULL
## 
## $`log-SAL_ALL`
## NULL
## 
## $`log-COMP_FUL`
## NULL
## 
## $`log-COMP_AC`
## NULL
## 
## $`log-COMP_AS`
## NULL
## 
## $`log-COMP_ALL`
## NULL
## 
## $`log-NUM_FULL`
## NULL
## 
## $`log-NUM_AC`
## NULL
## 
## $`log-NUM_AS`
## NULL
## 
## $`log-NUM_INS`
## NULL
## 
## $`log-NUM_ALL`
## NULL
  1. Построить matrix plot (pairs plot), его долго разглядывать с точки зрения outliers, неоднородностей, вида распределений, вида зависимостей (линейные/нелинейные) и пр.
names_col<-names(col_I_sn)
names_col
##  [1] "...1"     "PPIND"    "FICE"     "STATE"    "TYPE"     "AVRMATH" 
##  [7] "AVRVERB"  "AVRCOMB"  "AVR_ACT"  "MATH_1"   "MATH_3"   "VERB_1"  
## [13] "VERB_3"   "ACT_1"    "ACT_3"    "APP_REC"  "APP_ACC"  "NEW_STUD"
## [19] "NEW10"    "NEW25"    "FULLTIME" "PARTTIME" "IN_STATE" "OUT_STAT"
## [25] "R_B_COST" "ROOM"     "BOARD"    "ADD_FEE"  "BOOK"     "PERSONAL"
## [31] "PH_D"     "TERM_D"   "SF_RATIO" "DONATE"   "INSTRUCT" "GRADUAT" 
## [37] "SAL_FULL" "SAL_AC"   "SAL_AS"   "SAL_ALL"  "COMP_FUL" "COMP_AC" 
## [43] "COMP_AS"  "COMP_ALL" "NUM_FULL" "NUM_AC"   "NUM_AS"   "NUM_INS" 
## [49] "NUM_ALL"
avr_col<-names_col[grep("AVR", names_col)]
avr_col
## [1] "AVRMATH" "AVRVERB" "AVRCOMB" "AVR_ACT"
act_col<-names_col[grep("ACT", names_col)]
num_col<-names_col[grep("NUM", names_col)]
sal_col<-names_col[grep("SAL", names_col)]
comp_col<-names_col[grep("COMP", names_col)]
new_col<-names_col[grep("NEW", names_col)]
math_col<-names_col[grep("MATH", names_col)]
verb_col<-names_col[grep("VERB", names_col)]
app_col<-names_col[grep("APP", names_col)]
time_col<-names_col[grep("TIME", names_col)]
cat_col<-c("PPIND", "FICE", "STATE")
stat_col<-c("IN_STATE", "OUT_STAT")
misc_col<-c("R_B_COST", "ROOM", "BOARD", "ADD_FEE", "BOOK", "PERSONAL", "INSTRUCT")
col_list <- list(avr_col, act_col, num_col, sal_col, comp_col, new_col, math_col, verb_col, app_col, time_col)


create_ggpairs <- function(cols) {
  ggpairs(col_I_sn[c(cols, "PPIND")], aes(alpha = 0.5, color=PPIND),
        diag = list(continuous = wrap("barDiag", bins = 20)))
}

ggpairs_list <- map(col_list, create_ggpairs)
names(ggpairs_list) <- c("avr", "act", "num", "sal", "comp", "new", "math", "verb", "app", "time")
ggpairs_list
## $avr
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 68 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 68 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 68 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 68 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 68 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 68 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 105 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 105 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 105 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $act
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 95 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 95 rows containing missing values
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 95 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 67 rows containing missing values
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 95 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $num
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## $sal
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## $comp
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## $new
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 28 rows containing missing values
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $math
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 34 rows containing missing values
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $verb
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 34 rows containing missing values
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $app
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $time
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

col_list_log<-lapply(col_list, function(x) paste0("log-", x))
col_list_log
## [[1]]
## [1] "log-AVRMATH" "log-AVRVERB" "log-AVRCOMB" "log-AVR_ACT"
## 
## [[2]]
## [1] "log-AVR_ACT" "log-ACT_1"   "log-ACT_3"  
## 
## [[3]]
## [1] "log-NUM_FULL" "log-NUM_AC"   "log-NUM_AS"   "log-NUM_INS"  "log-NUM_ALL" 
## 
## [[4]]
## [1] "log-SAL_FULL" "log-SAL_AC"   "log-SAL_AS"   "log-SAL_ALL" 
## 
## [[5]]
## [1] "log-COMP_FUL" "log-COMP_AC"  "log-COMP_AS"  "log-COMP_ALL"
## 
## [[6]]
## [1] "log-NEW_STUD" "log-NEW10"    "log-NEW25"   
## 
## [[7]]
## [1] "log-AVRMATH" "log-MATH_1"  "log-MATH_3" 
## 
## [[8]]
## [1] "log-AVRVERB" "log-VERB_1"  "log-VERB_3" 
## 
## [[9]]
## [1] "log-APP_REC" "log-APP_ACC"
## 
## [[10]]
## [1] "log-FULLTIME" "log-PARTTIME"
create_ggpairs_log <- function(cols) {
  ggpairs(col_I_sn_log[c(cols, "PPIND")], aes(alpha = 0.5, color=PPIND),
        diag = list(continuous = wrap("barDiag", bins = 20)))
}

ggpairs_list_log <- map(col_list_log, create_ggpairs_log)
names(ggpairs_list_log) <-paste0("log-", c("avr", "act", "num", "sal", "comp", "new", "math", "verb", "app", "time"))
ggpairs_list_log
## $`log-avr`
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 68 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 68 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 68 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 68 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 68 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 68 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 105 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 105 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 105 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $`log-act`
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 95 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 95 rows containing missing values
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 95 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 67 rows containing missing values
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 95 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $`log-num`
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 27 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## $`log-sal`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## $`log-comp`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## $`log-new`
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 28 rows containing missing values
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $`log-math`
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 34 rows containing missing values
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $`log-verb`
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 75 rows containing missing values
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 34 rows containing missing values
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 75 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $`log-app`
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## 
## $`log-time`
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggpairs(col_I_sn[c(num_col, "PPIND")], aes(alpha = 0.5, color=PPIND),
        diag = list(continuous = wrap("barDiag", bins = 20)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. Если есть сильно несимметричные (с хвостом вправо) распределения на положительной полуоси, то прологарифмировать их и снова построить pairs plot - зависимости должны стать более линейными, а распределения более симметричными.

  2. Если есть outliers, то попробовать объяснить причину (ошибка в данных, особые индивиды) и удалить их.

# Function to detect outliers using boxplot.stats
find_outliers_boxplot <- function(x) {
  boxplot.stats(x)$out
}

# Apply the function to each column
outlier_list_boxplot <- lapply(col_I_sn[q_cols], find_outliers_boxplot)
names(outlier_list_boxplot) <- q_cols

lapply(outlier_list_boxplot, summary)
## $AVRMATH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   390.0   649.5   739.0   654.5   744.0   750.0 
## 
## $AVRVERB
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   620.0   623.5   639.0   639.7   653.5   665.0 
## 
## $AVRCOMB
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1355    1374    1391    1387    1403    1410 
## 
## $AVR_ACT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    30.0    30.0    30.5    30.5    31.0    31.0 
## 
## $MATH_1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     740     740     740     740     740     740 
## 
## $MATH_3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $VERB_1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     610     615     620     620     625     630 
## 
## $VERB_3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $ACT_1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $ACT_3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      33      33      33      33      33      33 
## 
## $APP_REC
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48094   48094   48094   48094   48094   48094 
## 
## $APP_ACC
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14141   14857   16920   18578   20641   26330 
## 
## $NEW_STUD
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5873    5874    6180    6349    6392    7425 
## 
## $NEW10
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $NEW25
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $FULLTIME
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   30017   30528   31039   30900   31341   31643 
## 
## $PARTTIME
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7443    8402    9295   10274   10338   21836 
## 
## $IN_STATE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $OUT_STAT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $R_B_COST
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $ROOM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6081    6302    6523    6523    6744    6965 
## 
## $BOARD
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3960    4250    4541    4420    4650    4760 
## 
## $ADD_FEE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1748    3247    4092    3533    4128    4374 
## 
## $BOOK
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1000    1058    1115    1115    1172    1230 
## 
## $PERSONAL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4288    4916    5544    5544    6172    6800 
## 
## $PH_D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      63      63      63      63      63      63 
## 
## $TERM_D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.00   69.25   71.50   70.75   73.00   73.00 
## 
## $SF_RATIO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $DONATE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   46.00   48.25   49.00   49.67   51.25   54.00 
## 
## $INSTRUCT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25765   28042   32090   35529   38011   62469 
## 
## $GRADUAT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##                                                 
## 
## $SAL_FULL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   952.0   960.2   966.5   973.5   979.8  1009.0 
## 
## $SAL_AC
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   637.0   647.5   658.0   676.0   695.5   733.0 
## 
## $SAL_AS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     515     519     528     534     532     576 
## 
## $SAL_ALL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   742.0   746.5   757.0   782.1   808.5   866.0 
## 
## $COMP_FUL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1204    1212    1220    1220    1228    1236 
## 
## $COMP_AC
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   831.0   850.5   870.0   870.0   889.5   909.0 
## 
## $COMP_AS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   674.0   677.0   695.0   695.2   713.2   717.0 
## 
## $COMP_ALL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   951.0   979.5   994.5  1003.8  1018.8  1075.0 
## 
## $NUM_FULL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   893.0   920.2   938.0   944.2   973.0   997.0 
## 
## $NUM_AC
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   552.0   615.5   679.0   650.7   700.0   721.0 
## 
## $NUM_AS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   427.0   443.0   472.0   466.4   480.0   510.0 
## 
## $NUM_INS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    93.0   107.8   122.5   130.0   156.2   178.0 
## 
## $NUM_ALL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1840    1887    1910    1979    2034    2261
remove_outliers <- function(df, outlier_list) {
  df_cleaned <- list()

  for (i in seq_along(df)) {  
      attr_name <- names(df)[i]
      outliers <- outlier_list[[attr_name]]
      if (!is.null(outliers)) {
        df_cleaned[[attr_name]] <- df[[attr_name]][!(df[[attr_name]] %in% outliers)]
      }
  }
  df_cleaned
}

df_cleaned <- remove_outliers(col_I_sn[q_cols], outlier_list_boxplot)
# Function to create a data frame with specific attribute and PPIND, filtered for "public" and "private"
create_filtered_df <- function(df, attr_name, outliers) {
    df %>%
      select(PPIND, attr_name) |>
      filter(if (!is.null(outliers)) !(!!sym(attr_name) %in% outliers)) #Remove outliers
}

# Create a list of data frames
attribute_names <- names(df)[-1]  #Exclude PPIND
list_dfs <- map2(q_cols, outlier_list_boxplot, ~ create_filtered_df(col_I_sn[c("PPIND", q_cols)], .x, .y))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(attr_name)
## 
##   # Now:
##   data %>% select(all_of(attr_name))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
names(list_dfs) <- q_cols
create_boxplot <- function(df, attr_name) {
  ggplot(df, aes(x = 1, y = !!sym(attr_name), fill = PPIND)) +
    geom_boxplot() +
    labs(title = paste("Boxplot of", attr_name, "by PPIND"), y = attr_name) +
    theme_bw()
}

boxplot_list <- map2(list_dfs, names(list_dfs), ~create_boxplot(.x, .y))
names(boxplot_list) <- names(list_dfs)

boxplot_list
## $AVRMATH
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $AVRVERB
## Warning: Removed 68 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $AVRCOMB
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $AVR_ACT
## Warning: Removed 83 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $MATH_1
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $MATH_3
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $VERB_1
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $VERB_3
## Warning: Removed 34 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $ACT_1
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $ACT_3
## Warning: Removed 67 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $APP_REC
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $APP_ACC
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $NEW_STUD

## 
## $NEW10
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $NEW25
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $FULLTIME

## 
## $PARTTIME
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $IN_STATE
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $OUT_STAT
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $R_B_COST

## 
## $ROOM
## Warning: Removed 42 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $BOARD
## Warning: Removed 64 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $ADD_FEE
## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $BOOK
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $PERSONAL
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $PH_D
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $TERM_D
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $SF_RATIO

## 
## $DONATE
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $INSTRUCT

## 
## $GRADUAT
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $SAL_FULL

## 
## $SAL_AC

## 
## $SAL_AS

## 
## $SAL_ALL

## 
## $COMP_FUL

## 
## $COMP_AC

## 
## $COMP_AS

## 
## $COMP_ALL

## 
## $NUM_FULL

## 
## $NUM_AC
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $NUM_AS

## 
## $NUM_INS
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

## 
## $NUM_ALL

  1. Посмотрите также на descriptive statistics с точки зрения минимумов-максимумов, асимметрии, эксцесса и пр.
describe(col_I_sn)|>rmarkdown::paged_table()
results <- describeBy(col_I_sn, group=col_I_sn$PPIND)
df_results <- map_dfr(results, ~as.data.frame(.x), .id = "group")
kable(df_results, "html") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(width = "100%", height = "400px")
group vars n mean sd median trimmed mad min max range skew kurtosis se
…1 private 1 53 56.283019 43.411881 45.0 52.116279 44.47800 1.0 153.0 152.0 0.6989808 -0.6825807 5.9630805
PPIND…2 private 2 53 1.000000 0.000000 1.0 1.000000 0.00000 1.0 1.0 0.0 NaN NaN 0.0000000
FICE…3 private 3 53 2466.622642 999.857590 2506.0 2381.883721 1111.95000 1131.0 6967.0 5836.0 1.6419950 5.3901228 137.3410025
STATE…4 private 4 53 25.188679 13.556231 25.0 25.139535 14.82600 5.0 49.0 44.0 -0.0172640 -1.4525694 1.8620916
TYPE…5 private 5 53 1.000000 0.000000 1.0 1.000000 0.00000 1.0 1.0 0.0 NaN NaN 0.0000000
AVRMATH…6 private 6 28 627.392857 76.594816 645.0 628.708333 74.13000 457.0 750.0 293.0 -0.2642842 -0.9977512 14.4750597
AVRVERB…7 private 7 28 554.714286 69.140682 557.0 554.833333 90.43860 426.0 665.0 239.0 -0.0601699 -1.3487042 13.0663607
AVRCOMB…8 private 8 29 1177.034483 144.286046 1199.0 1177.680000 161.60340 883.0 1410.0 527.0 -0.0893581 -1.2296542 26.7932461
AVR_ACT…9 private 9 19 26.842105 3.041622 28.0 27.000000 4.44780 20.0 31.0 11.0 -0.4302713 -0.8546401 0.6977959
MATH_1…10 private 10 51 573.137255 76.687423 590.0 573.536585 88.95600 390.0 740.0 350.0 -0.1120325 -0.7689927 10.7383871
MATH_3…11 private 11 51 680.294118 61.393581 690.0 682.243902 74.13000 510.0 780.0 270.0 -0.3845836 -0.6015429 8.5968208
VERB_1…12 private 12 51 501.274510 64.956933 500.0 500.560976 88.95600 360.0 630.0 270.0 0.0415450 -1.1069331 9.0957900
VERB_3…13 private 13 51 610.823529 59.854726 610.0 610.829268 74.13000 470.0 720.0 250.0 -0.0480738 -0.9266880 8.3813381
ACT_1…14 private 14 31 23.870968 2.837157 24.0 23.920000 2.96520 17.0 29.0 12.0 -0.2379435 -0.8258970 0.5095685
ACT_3…15 private 15 31 28.967742 2.562341 29.0 29.080000 2.96520 22.0 33.0 11.0 -0.4485672 -0.2000764 0.4602100
APP_REC…16 private 16 53 8022.905660 3989.729894 7654.0 7900.116279 4971.15780 1754.0 19227.0 17473.0 0.4132197 -0.4657545 548.0315482
APP_ACC…17 private 17 53 4059.603774 2025.552672 3587.0 3869.139535 2145.32220 507.0 10516.0 10009.0 0.9046848 0.7021832 278.2310574
NEW_STUD…18 private 18 53 1458.603774 806.308930 1236.0 1367.465116 541.14900 210.0 4615.0 4405.0 1.5081159 3.0032572 110.7550493
NEW10…19 private 19 52 57.788461 24.379625 58.0 57.761905 33.35850 16.0 98.0 82.0 0.0381849 -1.4295240 3.3808458
NEW25…20 private 20 49 80.061224 16.181790 82.0 81.268293 19.27380 42.0 99.0 57.0 -0.4992293 -0.9830636 2.3116843
FULLTIME…21 private 21 53 6300.943396 4195.174029 5217.0 5667.976744 2206.10880 912.0 27378.0 26466.0 2.6271796 9.8371061 576.2514704
PARTTIME…22 private 22 47 978.127660 1663.402480 484.0 638.256410 625.65720 16.0 10221.0 10205.0 3.8946926 17.8900634 242.6321886
IN_STATE…23 private 23 53 15624.150943 4194.954650 17020.0 16299.906977 2772.46200 2340.0 20100.0 17760.0 -1.3898856 1.3723903 576.2213365
OUT_STAT…24 private 24 53 15747.358491 3924.905335 17020.0 16306.186047 2772.46200 2340.0 20100.0 17760.0 -1.3082365 1.3192370 539.1272103
R_B_COST…25 private 25 53 5839.811321 997.070479 5975.0 5922.255814 816.91260 3320.0 7425.0 4105.0 -0.7233444 -0.2295138 136.9581633
ROOM…26 private 26 45 3321.444444 851.135166 3370.0 3269.216216 554.49240 1920.0 6965.0 5045.0 1.5052588 5.3161502 126.8797393
BOARD…27 private 27 43 2778.860465 725.254556 2775.0 2746.914286 639.00060 1400.0 4760.0 3360.0 0.4804975 0.4064642 110.6002829
ADD_FEE…28 private 28 46 451.934783 345.880416 405.0 403.842105 289.10700 40.0 1836.0 1796.0 1.7753185 4.1159323 50.9972852
BOOK…29 private 29 53 610.075472 158.110946 600.0 597.534884 148.26000 300.0 1230.0 930.0 1.2325851 3.0965764 21.7182087
PERSONAL…30 private 30 52 1493.807692 917.587246 1300.0 1358.380952 444.78000 300.0 6800.0 6500.0 3.8445961 19.0680802 127.2464564
PH_D…31 private 31 49 89.591837 7.865850 91.0 90.317073 7.41300 71.0 99.0 28.0 -0.8078233 -0.4898133 1.1236929
TERM_D…32 private 32 48 93.395833 5.700838 95.0 94.225000 4.44780 76.0 99.0 23.0 -1.4266943 1.5793545 0.8228451
SF_RATIO…33 private 33 53 9.813208 4.326885 9.2 9.555814 4.89258 2.9 20.5 17.6 0.4758614 -0.7282486 0.5943434
DONATE…34 private 34 53 27.037736 11.977483 25.0 26.441861 11.86080 4.0 54.0 50.0 0.4168192 -0.6654638 1.6452338
INSTRUCT…35 private 35 53 21194.415094 11855.236580 18287.0 19478.418605 9826.67280 7503.0 62469.0 54966.0 1.4396596 2.1048034 1628.4419823
GRADUAT…36 private 36 52 78.711539 15.753414 78.5 80.023809 17.79120 33.0 99.0 66.0 -0.6127581 -0.1790876 2.1846055
SAL_FULL…37 private 37 53 770.584906 98.577465 754.0 764.348837 97.85160 611.0 1009.0 398.0 0.4875072 -0.5641122 13.5406562
SAL_AC…38 private 38 53 539.415094 50.306917 536.0 534.720930 38.54760 449.0 733.0 284.0 1.2266441 2.7602908 6.9101865
SAL_AS…39 private 39 53 453.207547 39.494575 447.0 450.441861 34.09980 385.0 576.0 191.0 0.7393910 0.3554250 5.4249971
SAL_ALL…40 private 40 53 612.433962 90.184748 601.0 605.162791 77.09520 476.0 866.0 390.0 0.7542403 0.0361182 12.3878279
COMP_FUL…41 private 41 53 956.905660 120.627409 952.0 952.976744 114.16020 718.0 1236.0 518.0 0.3668762 -0.5426861 16.5694489
COMP_AC…42 private 42 53 682.584906 67.603890 682.0 677.813954 53.37360 533.0 909.0 376.0 0.7806619 1.3435425 9.2861084
COMP_AS…43 private 43 53 570.679245 57.059272 558.0 566.860465 56.33880 459.0 717.0 258.0 0.6114793 -0.0643986 7.8376938
COMP_ALL…44 private 44 53 766.094340 111.838184 744.0 759.511628 85.99080 570.0 1075.0 505.0 0.6055614 -0.0733207 15.3621560
NUM_FULL…45 private 45 53 275.792453 148.749061 236.0 261.790698 123.05580 55.0 654.0 599.0 0.8015128 -0.4075732 20.4322548
NUM_AC…46 private 46 52 168.192308 77.335287 149.0 160.404762 54.85620 40.0 382.0 342.0 0.9090695 0.2944050 10.7244747
NUM_AS…47 private 47 53 144.622641 57.649547 141.0 141.627907 53.37360 40.0 304.0 264.0 0.5337624 0.2576431 7.9187743
NUM_INS…48 private 48 53 16.150943 21.237119 8.0 12.116279 8.89560 0.0 111.0 111.0 2.2060240 5.7995390 2.9171427
NUM_ALL…49 private 49 53 625.716981 268.710623 556.0 602.093023 257.97240 161.0 1372.0 1211.0 0.7590003 -0.0133067 36.9102428
…50 public 1 123 92.934959 38.971982 97.0 95.636364 42.99540 2.0 152.0 150.0 -0.5325816 -0.6691725 3.5139853
PPIND…51 public 2 123 2.000000 0.000000 2.0 2.000000 0.00000 2.0 2.0 0.0 NaN NaN 0.0000000
FICE…52 public 3 123 3088.024390 2078.902916 2568.0 2648.797980 1395.12660 1009.0 10366.0 9357.0 2.0797258 4.1720683 187.4483633
STATE…53 public 4 123 26.577236 14.568666 27.0 26.868687 17.79120 1.0 50.0 49.0 -0.1483948 -1.2889923 1.3136124
TYPE…54 public 5 123 1.000000 0.000000 1.0 1.000000 0.00000 1.0 1.0 0.0 NaN NaN 0.0000000
AVRMATH…55 public 6 80 541.087500 49.213869 535.5 539.687500 46.70190 390.0 655.0 265.0 0.0872105 0.3668895 5.5022778
AVRVERB…56 public 7 80 473.675000 37.327756 465.0 471.078125 31.87590 391.0 600.0 209.0 0.7516249 0.7460887 4.1733700
AVRCOMB…57 public 8 80 1014.662500 84.658651 997.0 1010.687500 75.61260 810.0 1240.0 430.0 0.4180047 0.1997793 9.4651249
AVR_ACT…58 public 9 74 22.878378 1.766916 23.0 22.783333 1.48260 19.0 27.0 8.0 0.4465584 0.2074695 0.2053998
MATH_1…59 public 10 91 483.109890 56.137025 470.0 481.561644 59.30400 350.0 630.0 280.0 0.2120495 -0.3354537 5.8847592
MATH_3…60 public 11 91 609.582418 52.348844 610.0 608.794520 44.47800 480.0 730.0 250.0 0.0468177 -0.2326174 5.4876499
VERB_1…61 public 12 91 418.648352 41.648362 412.0 417.095890 40.03020 320.0 540.0 220.0 0.4096931 0.1814679 4.3659347
VERB_3…62 public 13 91 533.571429 42.218254 530.0 531.397260 44.47800 440.0 650.0 210.0 0.4561332 0.1613254 4.4256755
ACT_1…63 public 14 78 20.243590 1.948598 20.0 20.203125 1.48260 16.0 25.0 9.0 0.2533577 -0.2129912 0.2206353
ACT_3…64 public 15 78 25.589744 1.882496 25.5 25.562500 2.22390 21.0 31.0 10.0 0.1906946 0.4699948 0.2131506
APP_REC…65 public 16 122 8770.885246 6213.519109 7628.0 8115.673469 5043.80520 787.0 48094.0 47307.0 2.3449331 11.2548093 562.5455881
APP_ACC…66 public 17 122 6191.950820 3936.148165 5468.5 5770.632653 3498.19470 601.0 26330.0 25729.0 1.5943959 4.6898831 356.3621107
NEW_STUD…67 public 18 123 2594.447155 1321.726536 2408.0 2459.949495 1143.08460 233.0 7425.0 7192.0 1.0361383 1.3969684 119.1760683
NEW10…68 public 19 108 33.620370 21.866937 26.0 29.920455 11.86080 8.0 95.0 87.0 1.5420644 1.5799521 2.1041470
NEW25…69 public 20 101 58.504950 17.283879 57.0 57.691358 13.34340 24.0 99.0 75.0 0.4213560 -0.4515245 1.7198102
FULLTIME…70 public 21 123 13448.065041 6196.503540 13436.0 12986.424242 5325.49920 1017.0 31643.0 30626.0 0.6892755 0.4241041 558.7199084
PARTTIME…71 public 22 123 3108.975610 2807.979142 2411.0 2638.545454 1632.34260 114.0 21836.0 21722.0 3.0574510 14.9944532 253.1869527
IN_STATE…72 public 23 116 2537.137931 1078.590561 2408.0 2479.372340 877.69920 647.0 6210.0 5563.0 0.6645711 0.6461950 100.1446195
OUT_STAT…73 public 24 122 7657.549180 2252.312472 7446.5 7517.163265 1939.24080 2279.0 15732.0 13453.0 0.7771996 1.2996173 203.9147900
R_B_COST…74 public 25 123 3977.073171 842.490474 3811.0 3911.979798 722.02620 2082.0 6607.0 4525.0 0.7886999 0.7902446 75.9648078
ROOM…75 public 26 89 2397.505618 975.041854 1960.0 2284.013699 607.86600 1033.0 6081.0 5048.0 1.1743603 1.1457148 103.3542298
BOARD…76 public 27 69 1905.565217 434.954037 1900.0 1884.087719 385.47600 1000.0 3271.0 2271.0 0.6514170 1.0601888 52.3623128
ADD_FEE…77 public 28 90 748.388889 1015.494917 443.0 495.569444 404.00850 20.0 4374.0 4354.0 2.6479006 6.0998551 107.0425630
BOOK…78 public 29 121 600.008264 101.162122 600.0 591.103093 111.19500 400.0 858.0 458.0 0.6447669 -0.2889320 9.1965565
PERSONAL…79 public 30 113 1886.495575 761.545927 1851.0 1849.109890 816.91260 500.0 4288.0 3788.0 0.4382313 -0.1747922 71.6402146
PH_D…80 public 31 118 84.110169 7.798514 85.0 84.406250 7.41300 63.0 99.0 36.0 -0.3698537 -0.3490528 0.7179114
TERM_D…81 public 32 112 89.285714 6.765611 90.0 89.988889 5.93040 67.0 99.0 32.0 -0.9104174 0.6304258 0.6392901
SF_RATIO…82 public 33 123 16.129268 3.830202 16.5 16.178788 4.15128 6.7 24.7 18.0 -0.0994206 -0.5699091 0.3453577
DONATE…83 public 34 111 15.180180 7.924873 14.0 14.269663 7.41300 4.0 48.0 44.0 1.1950772 1.9153609 0.7521956
INSTRUCT…84 public 35 123 9228.512195 3136.136938 8612.0 8884.141414 2828.80080 3605.0 20260.0 16655.0 0.9838824 0.7050905 282.7759447
GRADUAT…85 public 36 119 54.731092 15.573063 54.0 54.412371 16.30860 10.0 95.0 85.0 0.1127780 -0.1892703 1.4275804
SAL_FULL…86 public 37 123 625.520325 76.323873 615.0 625.070707 77.09520 446.0 846.0 400.0 0.1043039 -0.3571508 6.8818919
SAL_AC…87 public 38 123 464.593496 43.609827 460.0 462.858586 40.03020 364.0 611.0 247.0 0.4857462 0.5123490 3.9321657
SAL_AS…88 public 39 123 395.357724 30.597099 394.0 394.404040 29.65200 323.0 478.0 155.0 0.2511988 -0.1399305 2.7588475
SAL_ALL…89 public 40 123 500.186992 61.477218 495.0 499.353535 65.23440 362.0 665.0 303.0 0.1876957 -0.4858715 5.5432141
COMP_FUL…90 public 41 123 771.333333 98.575565 762.0 770.383838 109.71240 537.0 1045.0 508.0 0.0888023 -0.4000660 8.8882594
COMP_AC…91 public 42 123 581.682927 60.762143 573.0 578.393939 62.26920 438.0 780.0 342.0 0.5334294 0.5391767 5.4787379
COMP_AS…92 public 43 123 496.707317 43.222590 494.0 495.515152 41.51280 395.0 624.0 229.0 0.2751813 -0.0158559 3.8972497
COMP_ALL…93 public 44 123 622.560976 79.774595 616.0 621.909091 90.43860 436.0 838.0 402.0 0.1262098 -0.5008610 7.1930330
NUM_FULL…94 public 45 123 363.113821 223.230462 297.0 335.262626 183.84240 39.0 997.0 958.0 1.0456825 0.4760854 20.1280130
NUM_AC…95 public 46 123 256.536585 125.159328 231.0 246.010101 111.19500 32.0 721.0 689.0 0.9641768 1.2571277 11.2852365
NUM_AS…96 public 47 123 210.528455 94.538077 195.0 202.020202 84.50820 29.0 510.0 481.0 0.8391736 0.6148800 8.5242113
NUM_INS…97 public 48 122 31.844262 34.957831 20.0 25.775510 26.68680 0.0 178.0 178.0 1.7910083 3.7266901 3.1649333
NUM_ALL…98 public 49 123 893.382114 440.880797 793.0 848.363636 401.78460 108.0 2261.0 2153.0 0.8715691 0.3147285 39.7528827
skim(col_I_sn)[,c("skim_variable", "numeric.hist")]|>rmarkdown::paged_table()
describe(col_I_sn_log)|>rmarkdown::paged_table()
results <- describeBy(col_I_sn_log, group=col_I_sn_log$PPIND)

df_results <- map_dfr(results, ~as.data.frame(.x), .id = "group")

kable(df_results, "html") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(width = "100%", height = "400px")
group vars n mean sd median trimmed mad min max range skew kurtosis se
…1 private 1 53 56.283019 43.4118813 45.000000 52.116279 44.4780000 1.0000000 153.000000 152.0000000 0.6989808 -0.6825807 5.9630805
PPIND…2 private 2 53 1.000000 0.0000000 1.000000 1.000000 0.0000000 1.0000000 1.000000 0.0000000 NaN NaN 0.0000000
FICE…3 private 3 53 2466.622642 999.8575904 2506.000000 2381.883721 1111.9500000 1131.0000000 6967.000000 5836.0000000 1.6419950 5.3901228 137.3410025
STATE…4 private 4 53 25.188679 13.5562313 25.000000 25.139535 14.8260000 5.0000000 49.000000 44.0000000 -0.0172640 -1.4525694 1.8620916
TYPE…5 private 5 53 1.000000 0.0000000 1.000000 1.000000 0.0000000 1.0000000 1.000000 0.0000000 NaN NaN 0.0000000
log-AVRMATH…6 private 6 28 6.434103 0.1257932 6.469220 6.438741 0.1157016 6.1246834 6.620073 0.4953898 -0.4656246 -0.7140574 0.0237727
log-AVRVERB…7 private 7 28 6.310820 0.1264692 6.322559 6.313102 0.1609462 6.0544393 6.499787 0.4453477 -0.2000872 -1.2703684 0.0239004
log-AVRCOMB…8 private 8 29 7.063338 0.1246905 7.089243 7.066058 0.1413069 6.7833252 7.251345 0.4680198 -0.2535539 -1.0771346 0.0231544
log-AVR_ACT…9 private 9 19 3.283579 0.1177341 3.332205 3.291663 0.1509030 2.9957323 3.433987 0.4382549 -0.6329872 -0.4516265 0.0270101
log-MATH_1…10 private 10 51 6.342068 0.1371844 6.380122 6.346619 0.1435896 5.9661467 6.606650 0.6405034 -0.3773748 -0.5468103 0.0192097
log-MATH_3…11 private 11 51 6.518394 0.0926585 6.536692 6.523026 0.1037206 6.2344107 6.659294 0.4248832 -0.5832625 -0.1388006 0.0129748
log-VERB_1…12 private 12 51 6.208815 0.1309561 6.214608 6.210509 0.1680211 5.8861040 6.445720 0.5596158 -0.1522188 -0.9704001 0.0183375
log-VERB_3…13 private 13 51 6.410036 0.0990664 6.413459 6.412007 0.1267952 6.1527327 6.579251 0.4265185 -0.2237489 -0.7417599 0.0138721
log-ACT_1…14 private 14 31 3.165572 0.1223224 3.178054 3.170557 0.1290031 2.8332133 3.367296 0.5340825 -0.4827793 -0.3490919 0.0219697
log-ACT_3…15 private 15 31 3.362244 0.0912073 3.367296 3.368112 0.0988766 3.0910425 3.496508 0.4054651 -0.6997901 0.4249089 0.0163813
log-APP_REC…16 private 16 53 8.844917 0.5823906 8.942984 8.896994 0.6544108 7.4696542 9.864071 2.3944166 -0.6547206 -0.2751775 0.0799975
log-APP_ACC…17 private 17 53 8.179080 0.5441333 8.185071 8.203184 0.5508374 6.2285110 9.260653 3.0321422 -0.7464176 1.4867564 0.0747425
log-NEW_STUD…18 private 18 53 7.146480 0.5464094 7.119636 7.164144 0.4220124 5.3471075 8.437067 3.0899596 -0.4127149 1.0569984 0.0750551
log-NEW10…19 private 19 52 3.954240 0.4792113 4.059848 3.988609 0.5675439 2.7725887 4.584968 1.8123788 -0.4763167 -0.9226227 0.0664546
log-NEW25…20 private 20 49 4.360272 0.2213973 4.406719 4.383760 0.2337007 3.7376696 4.595120 0.8574502 -0.8236338 -0.2126319 0.0316282
log-FULLTIME…21 private 21 53 8.586232 0.5702424 8.559678 8.586250 0.4063457 6.8156400 10.217495 3.4018551 -0.0797713 1.2081209 0.0783288
log-PARTTIME…22 private 22 47 5.903632 1.5471494 6.182085 5.917210 1.6470689 2.7725887 9.232200 6.4596110 -0.1886845 -0.8926380 0.2256749
log-IN_STATE…23 private 23 53 9.596021 0.4126202 9.742144 9.685029 0.1631599 7.7579062 9.908475 2.1505689 -2.6026765 7.4369593 0.0566777
log-OUT_STAT…24 private 24 53 9.615242 0.3683239 9.742144 9.685652 0.1631599 7.7579062 9.908475 2.1505689 -2.8330397 10.2849805 0.0505932
log-R_B_COST…25 private 25 53 8.656172 0.1886972 8.695339 8.679479 0.1307803 8.1077201 8.912608 0.8048879 -1.0948572 0.5490446 0.0259196
log-ROOM…26 private 26 45 8.079043 0.2416058 8.122668 8.080739 0.1744052 7.5600805 8.848653 1.2885724 0.1707309 0.9183040 0.0360165
log-BOARD…27 private 27 43 7.895760 0.2681052 7.928406 7.905355 0.2342571 7.2442275 8.468003 1.2237754 -0.3530655 0.1162396 0.0408857
log-ADD_FEE…28 private 28 46 5.838785 0.7936613 6.003811 5.872662 0.7083206 3.6888795 7.515345 3.8264651 -0.4493357 0.0057261 0.1170190
log-BOOK…29 private 29 53 6.383096 0.2476752 6.396930 6.382047 0.2409506 5.7037825 7.114769 1.4109870 0.0914956 0.9139729 0.0340208
log-PERSONAL…30 private 30 52 7.198656 0.4484385 7.170120 7.184433 0.3889813 5.7037825 8.824678 3.1208954 0.3667273 3.5127485 0.0621872
log-PH_D…31 private 31 49 4.491264 0.0917167 4.510859 4.501068 0.0793023 4.2626799 4.595120 0.3324400 -0.9322867 -0.2313391 0.0131024
log-TERM_D…32 private 32 48 4.534896 0.0642132 4.553877 4.544888 0.0460949 4.3307333 4.595120 0.2643865 -1.5807128 2.1534537 0.0092684
log-SF_RATIO…33 private 33 53 2.182168 0.4676187 2.219203 2.198625 0.5685566 1.0647107 3.020425 1.9557141 -0.2702173 -0.7508554 0.0642324
log-DONATE…34 private 34 53 3.186231 0.5080743 3.218876 3.222420 0.5406199 1.3862944 3.988984 2.6026897 -0.8624424 1.3041926 0.0697894
log-INSTRUCT…35 private 35 53 9.829992 0.5074521 9.813946 9.809155 0.5435289 8.9230582 11.042426 2.1193675 0.3173107 -0.6541676 0.0697039
log-GRADUAT…36 private 36 52 4.342732 0.2274029 4.362916 4.371913 0.2345820 3.4965076 4.595120 1.0986123 -1.2954099 2.1762397 0.0315351
log-SAL_FULL…37 private 37 53 6.639312 0.1257611 6.625392 6.634831 0.1358114 6.4150970 6.916715 0.5016181 0.2585516 -0.7606861 0.0172746
log-SAL_AC…38 private 38 53 6.286460 0.0894229 6.284134 6.280319 0.0708158 6.1070229 6.597146 0.4901228 0.8236596 1.4983290 0.0122832
log-SAL_AS…39 private 39 53 6.112745 0.0850986 6.102559 6.108573 0.0743881 5.9532433 6.356108 0.4028643 0.5157075 -0.0606235 0.0116892
log-SAL_ALL…40 private 40 53 6.407302 0.1422726 6.398595 6.400436 0.1341701 6.1654179 6.763885 0.5984671 0.4486689 -0.4286232 0.0195426
log-COMP_FUL…41 private 41 53 6.856006 0.1250443 6.858565 6.855704 0.1250442 6.5764696 7.119636 0.5431661 0.1037261 -0.5796099 0.0171762
log-COMP_AC…42 private 42 53 6.521235 0.0967251 6.525030 6.517304 0.0804016 6.2785214 6.812345 0.5338237 0.3912482 0.7753791 0.0132862
log-COMP_AS…43 private 43 53 6.342060 0.0979777 6.324359 6.337889 0.1026431 6.1290502 6.575076 0.4460256 0.3776698 -0.2911012 0.0134583
log-COMP_ALL…44 private 44 53 6.631183 0.1427528 6.612041 6.627992 0.1181732 6.3456364 6.980076 0.6344396 0.2691417 -0.3877691 0.0196086
log-NUM_FULL…45 private 45 53 5.476724 0.5499620 5.463832 5.486443 0.5456719 4.0073332 6.483107 2.4757742 -0.1371297 -0.4976673 0.0755431
log-NUM_AC…46 private 46 52 5.022570 0.4670540 5.003946 5.035011 0.3979307 3.6888795 5.945421 2.2565412 -0.2868779 0.1719312 0.0647687
log-NUM_AS…47 private 47 53 4.887067 0.4440564 4.948760 4.919891 0.3546819 3.6888795 5.717028 2.0281482 -0.7460517 0.5711032 0.0609958
log-NUM_INS…48 private 48 43 2.517066 0.9652043 2.302585 2.487473 1.0276600 0.6931472 4.709530 4.0163830 0.3615373 -0.8604112 0.1471923
log-NUM_ALL…49 private 49 53 6.348085 0.4383580 6.320768 6.355203 0.4769799 5.0814044 7.224025 2.1426204 -0.2412857 -0.0981527 0.0602131
…50 public 1 123 92.934959 38.9719823 97.000000 95.636364 42.9954000 2.0000000 152.000000 150.0000000 -0.5325816 -0.6691725 3.5139853
PPIND…51 public 2 123 2.000000 0.0000000 2.000000 2.000000 0.0000000 2.0000000 2.000000 0.0000000 NaN NaN 0.0000000
FICE…52 public 3 123 3088.024390 2078.9029163 2568.000000 2648.797980 1395.1266000 1009.0000000 10366.000000 9357.0000000 2.0797258 4.1720683 187.4483633
STATE…53 public 4 123 26.577236 14.5686662 27.000000 26.868687 17.7912000 1.0000000 50.000000 49.0000000 -0.1483948 -1.2889923 1.3136124
TYPE…54 public 5 123 1.000000 0.0000000 1.000000 1.000000 0.0000000 1.0000000 1.000000 0.0000000 NaN NaN 0.0000000
log-AVRMATH…55 public 6 80 6.289457 0.0917565 6.283201 6.289384 0.0886188 5.9661467 6.484635 0.5184885 -0.2785880 0.9500909 0.0102587
log-AVRVERB…56 public 7 80 6.157545 0.0771287 6.142037 6.153823 0.0685254 5.9687076 6.396930 0.4282221 0.5040111 0.3491819 0.0086233
log-AVRCOMB…57 public 8 80 6.918917 0.0827251 6.904751 6.917073 0.0778487 6.6970342 7.122867 0.4258324 0.1534079 0.2074656 0.0092490
log-AVR_ACT…58 public 9 74 3.127291 0.0764830 3.135494 3.124947 0.0659042 2.9444390 3.295837 0.3513979 0.2059796 0.1667852 0.0088910
log-MATH_1…59 public 10 91 6.173557 0.1164990 6.152733 6.173952 0.1210958 5.8579332 6.445720 0.5877867 -0.0861901 -0.2361476 0.0122124
log-MATH_3…60 public 11 91 6.409099 0.0864520 6.413459 6.409949 0.0747688 6.1737861 6.593045 0.4192584 -0.1899023 -0.1233113 0.0090626
log-VERB_1…61 public 12 91 6.032198 0.0986871 6.021023 6.031259 0.0974830 5.7683210 6.291569 0.5232481 0.1073253 0.0621089 0.0103452
log-VERB_3…62 public 13 91 6.276543 0.0782974 6.272877 6.274208 0.0816316 6.0867747 6.476972 0.3901976 0.2279405 -0.0377152 0.0082078
log-ACT_1…63 public 14 78 3.003281 0.0960934 2.995732 3.003594 0.0760474 2.7725887 3.218876 0.4462871 -0.0030292 -0.2118761 0.0108804
log-ACT_3…64 public 15 78 3.239521 0.0736256 3.238486 3.240004 0.0850281 3.0445224 3.433987 0.3894648 -0.0798154 0.4378491 0.0083365
log-APP_REC…65 public 16 122 8.846410 0.7225605 8.939570 8.891467 0.7718369 6.6682282 10.780913 4.1126845 -0.4860993 0.2431071 0.0654176
log-APP_ACC…66 public 17 122 8.529329 0.6756904 8.606640 8.569983 0.6750134 6.3985949 10.178464 3.7798693 -0.5734094 0.4915048 0.0611741
log-NEW_STUD…67 public 18 123 7.721430 0.5703518 7.786552 7.759060 0.4863997 5.4510385 8.912608 3.4615695 -0.9309981 1.9272113 0.0514269
log-NEW10…68 public 19 108 3.343840 0.5733746 3.258097 3.325655 0.4824713 2.0794415 4.553877 2.4744353 0.3164250 -0.1843190 0.0551730
log-NEW25…69 public 20 101 4.024751 0.3039017 4.043051 4.034042 0.2242150 3.1780538 4.595120 1.4170660 -0.2785966 -0.2369333 0.0302394
log-FULLTIME…70 public 21 123 9.384612 0.5398449 9.505693 9.425261 0.4411550 6.9246124 10.362272 3.4376598 -1.1704252 2.9889701 0.0486762
log-PARTTIME…71 public 22 123 7.722352 0.8444922 7.787797 7.751869 0.7943942 4.7361984 9.991315 5.2551168 -0.5523266 1.2841111 0.0761453
log-IN_STATE…72 public 23 116 7.739242 0.4733181 7.786530 7.780124 0.3678349 6.4723463 8.733916 2.2615699 -0.7303445 0.5088757 0.0439465
log-OUT_STAT…73 public 24 122 8.900562 0.2987634 8.915499 8.907378 0.2465889 7.7314920 9.663452 1.9319601 -0.3875123 1.1365253 0.0270488
log-R_B_COST…74 public 25 123 8.266950 0.2066127 8.245647 8.263462 0.1902686 7.6410842 8.795885 1.1548007 0.1062924 0.3187606 0.0186296
log-ROOM…75 public 26 89 7.710137 0.3728542 7.580700 7.693559 0.3384163 6.9402225 8.712924 1.7727020 0.4491298 -0.6191358 0.0395225
log-BOARD…76 public 27 69 7.527164 0.2280129 7.549609 7.529727 0.1913003 6.9077553 8.092851 1.1850957 -0.1434297 0.1929579 0.0274495
log-ADD_FEE…77 public 28 90 6.014273 1.1296149 6.093567 6.024473 0.8723448 2.9957323 8.383433 5.3877009 -0.1980167 0.2896088 0.1190719
log-BOOK…78 public 29 121 6.383399 0.1639246 6.396930 6.375006 0.1979736 5.9914645 6.754604 0.7631396 0.3227857 -0.5787595 0.0149022
log-PERSONAL…79 public 30 113 7.452475 0.4466677 7.523481 7.479479 0.4456172 6.2146081 8.363576 2.1489676 -0.6382180 0.2629272 0.0420190
log-PH_D…80 public 31 118 4.427713 0.0952721 4.442651 4.433566 0.0898821 4.1431347 4.595120 0.4519851 -0.5973045 -0.0100275 0.0087705
log-TERM_D…81 public 32 112 4.488829 0.0791421 4.499810 4.498530 0.0674025 4.2046926 4.595120 0.3904272 -1.1374689 1.2826948 0.0074782
log-SF_RATIO…82 public 33 123 2.749871 0.2566883 2.803360 2.769087 0.2542237 1.9021075 3.206803 1.3046957 -0.6984298 0.2194364 0.0231448
log-DONATE…83 public 34 111 2.591623 0.5135815 2.639057 2.592837 0.5288063 1.3862944 3.871201 2.4849066 -0.0364465 -0.6377772 0.0487470
log-INSTRUCT…84 public 35 123 9.076856 0.3248147 9.060912 9.066732 0.3493098 8.1900770 9.916404 1.7263267 0.1715987 -0.1927329 0.0292876
log-GRADUAT…85 public 36 119 3.956547 0.3219572 3.988984 3.975559 0.2975144 2.3025851 4.553877 2.2512918 -1.2663428 4.3620818 0.0295138
log-SAL_FULL…86 public 37 123 6.431116 0.1232345 6.421622 6.434299 0.1283457 6.1003190 6.740519 0.6402004 -0.1982247 -0.3215652 0.0111117
log-SAL_AC…87 public 38 123 6.136864 0.0928413 6.131227 6.135658 0.0845640 5.8971539 6.415097 0.5179431 0.1761265 0.2161470 0.0083712
log-SAL_AS…88 public 39 123 5.976834 0.0771750 5.976351 5.976126 0.0769879 5.7776523 6.169611 0.3919584 0.0401181 -0.1748151 0.0069586
log-SAL_ALL…89 public 40 123 6.207465 0.1233843 6.204558 6.209585 0.1290031 5.8916442 6.499787 0.6081428 -0.0906247 -0.4762216 0.0111252
log-COMP_FUL…90 public 41 123 6.639907 0.1293547 6.635947 6.642845 0.1407233 6.2859981 6.951772 0.6657741 -0.2253881 -0.3104789 0.0116635
log-COMP_AC…91 public 42 123 6.360619 0.1030895 6.350886 6.357962 0.1048740 6.0822189 6.659294 0.5770750 0.1870974 0.2715204 0.0092953
log-COMP_AS…92 public 43 123 6.204264 0.0867699 6.202535 6.203972 0.0833314 5.9788858 6.436150 0.4572646 0.0246014 -0.0756552 0.0078238
log-COMP_ALL…93 public 44 123 6.425617 0.1292829 6.423247 6.428720 0.1439567 6.0776422 6.731018 0.6533759 -0.1681111 -0.4328889 0.0116570
log-NUM_FULL…94 public 45 123 5.705138 0.6411374 5.693732 5.725297 0.6582669 3.6635616 6.904751 3.2411891 -0.3157094 -0.0912803 0.0578094
log-NUM_AC…95 public 46 123 5.423423 0.5238662 5.442418 5.451205 0.4988537 3.4657359 6.580639 3.1149032 -0.6094679 0.8271179 0.0472354
log-NUM_AS…96 public 47 123 5.244416 0.4819468 5.273000 5.264631 0.4392437 3.3672958 6.234411 2.8671149 -0.6616922 1.2402107 0.0434557
log-NUM_INS…97 public 48 106 3.101015 1.1364431 3.257356 3.181644 1.1452339 0.0000000 5.181784 5.1817836 -0.6770142 0.1581346 0.1103811
log-NUM_ALL…98 public 49 123 6.670777 0.5194675 6.675823 6.684448 0.5078551 4.6821312 7.723562 3.0414312 -0.4836618 0.7437253 0.0468388
data <- col_I_sn

results <- describeBy(col_I_sn, group=col_I_sn$PPIND)

df_results <- map_dfr(results, ~as.data.frame(.x), .id = "group")

kable(df_results, "html") %>%
  kable_styling(full_width = FALSE) %>%
  scroll_box(width = "100%", height = "400px")
group vars n mean sd median trimmed mad min max range skew kurtosis se
…1 private 1 53 56.283019 43.411881 45.0 52.116279 44.47800 1.0 153.0 152.0 0.6989808 -0.6825807 5.9630805
PPIND…2 private 2 53 1.000000 0.000000 1.0 1.000000 0.00000 1.0 1.0 0.0 NaN NaN 0.0000000
FICE…3 private 3 53 2466.622642 999.857590 2506.0 2381.883721 1111.95000 1131.0 6967.0 5836.0 1.6419950 5.3901228 137.3410025
STATE…4 private 4 53 25.188679 13.556231 25.0 25.139535 14.82600 5.0 49.0 44.0 -0.0172640 -1.4525694 1.8620916
TYPE…5 private 5 53 1.000000 0.000000 1.0 1.000000 0.00000 1.0 1.0 0.0 NaN NaN 0.0000000
AVRMATH…6 private 6 28 627.392857 76.594816 645.0 628.708333 74.13000 457.0 750.0 293.0 -0.2642842 -0.9977512 14.4750597
AVRVERB…7 private 7 28 554.714286 69.140682 557.0 554.833333 90.43860 426.0 665.0 239.0 -0.0601699 -1.3487042 13.0663607
AVRCOMB…8 private 8 29 1177.034483 144.286046 1199.0 1177.680000 161.60340 883.0 1410.0 527.0 -0.0893581 -1.2296542 26.7932461
AVR_ACT…9 private 9 19 26.842105 3.041622 28.0 27.000000 4.44780 20.0 31.0 11.0 -0.4302713 -0.8546401 0.6977959
MATH_1…10 private 10 51 573.137255 76.687423 590.0 573.536585 88.95600 390.0 740.0 350.0 -0.1120325 -0.7689927 10.7383871
MATH_3…11 private 11 51 680.294118 61.393581 690.0 682.243902 74.13000 510.0 780.0 270.0 -0.3845836 -0.6015429 8.5968208
VERB_1…12 private 12 51 501.274510 64.956933 500.0 500.560976 88.95600 360.0 630.0 270.0 0.0415450 -1.1069331 9.0957900
VERB_3…13 private 13 51 610.823529 59.854726 610.0 610.829268 74.13000 470.0 720.0 250.0 -0.0480738 -0.9266880 8.3813381
ACT_1…14 private 14 31 23.870968 2.837157 24.0 23.920000 2.96520 17.0 29.0 12.0 -0.2379435 -0.8258970 0.5095685
ACT_3…15 private 15 31 28.967742 2.562341 29.0 29.080000 2.96520 22.0 33.0 11.0 -0.4485672 -0.2000764 0.4602100
APP_REC…16 private 16 53 8022.905660 3989.729894 7654.0 7900.116279 4971.15780 1754.0 19227.0 17473.0 0.4132197 -0.4657545 548.0315482
APP_ACC…17 private 17 53 4059.603774 2025.552672 3587.0 3869.139535 2145.32220 507.0 10516.0 10009.0 0.9046848 0.7021832 278.2310574
NEW_STUD…18 private 18 53 1458.603774 806.308930 1236.0 1367.465116 541.14900 210.0 4615.0 4405.0 1.5081159 3.0032572 110.7550493
NEW10…19 private 19 52 57.788461 24.379625 58.0 57.761905 33.35850 16.0 98.0 82.0 0.0381849 -1.4295240 3.3808458
NEW25…20 private 20 49 80.061224 16.181790 82.0 81.268293 19.27380 42.0 99.0 57.0 -0.4992293 -0.9830636 2.3116843
FULLTIME…21 private 21 53 6300.943396 4195.174029 5217.0 5667.976744 2206.10880 912.0 27378.0 26466.0 2.6271796 9.8371061 576.2514704
PARTTIME…22 private 22 47 978.127660 1663.402480 484.0 638.256410 625.65720 16.0 10221.0 10205.0 3.8946926 17.8900634 242.6321886
IN_STATE…23 private 23 53 15624.150943 4194.954650 17020.0 16299.906977 2772.46200 2340.0 20100.0 17760.0 -1.3898856 1.3723903 576.2213365
OUT_STAT…24 private 24 53 15747.358491 3924.905335 17020.0 16306.186047 2772.46200 2340.0 20100.0 17760.0 -1.3082365 1.3192370 539.1272103
R_B_COST…25 private 25 53 5839.811321 997.070479 5975.0 5922.255814 816.91260 3320.0 7425.0 4105.0 -0.7233444 -0.2295138 136.9581633
ROOM…26 private 26 45 3321.444444 851.135166 3370.0 3269.216216 554.49240 1920.0 6965.0 5045.0 1.5052588 5.3161502 126.8797393
BOARD…27 private 27 43 2778.860465 725.254556 2775.0 2746.914286 639.00060 1400.0 4760.0 3360.0 0.4804975 0.4064642 110.6002829
ADD_FEE…28 private 28 46 451.934783 345.880416 405.0 403.842105 289.10700 40.0 1836.0 1796.0 1.7753185 4.1159323 50.9972852
BOOK…29 private 29 53 610.075472 158.110946 600.0 597.534884 148.26000 300.0 1230.0 930.0 1.2325851 3.0965764 21.7182087
PERSONAL…30 private 30 52 1493.807692 917.587246 1300.0 1358.380952 444.78000 300.0 6800.0 6500.0 3.8445961 19.0680802 127.2464564
PH_D…31 private 31 49 89.591837 7.865850 91.0 90.317073 7.41300 71.0 99.0 28.0 -0.8078233 -0.4898133 1.1236929
TERM_D…32 private 32 48 93.395833 5.700838 95.0 94.225000 4.44780 76.0 99.0 23.0 -1.4266943 1.5793545 0.8228451
SF_RATIO…33 private 33 53 9.813208 4.326885 9.2 9.555814 4.89258 2.9 20.5 17.6 0.4758614 -0.7282486 0.5943434
DONATE…34 private 34 53 27.037736 11.977483 25.0 26.441861 11.86080 4.0 54.0 50.0 0.4168192 -0.6654638 1.6452338
INSTRUCT…35 private 35 53 21194.415094 11855.236580 18287.0 19478.418605 9826.67280 7503.0 62469.0 54966.0 1.4396596 2.1048034 1628.4419823
GRADUAT…36 private 36 52 78.711539 15.753414 78.5 80.023809 17.79120 33.0 99.0 66.0 -0.6127581 -0.1790876 2.1846055
SAL_FULL…37 private 37 53 770.584906 98.577465 754.0 764.348837 97.85160 611.0 1009.0 398.0 0.4875072 -0.5641122 13.5406562
SAL_AC…38 private 38 53 539.415094 50.306917 536.0 534.720930 38.54760 449.0 733.0 284.0 1.2266441 2.7602908 6.9101865
SAL_AS…39 private 39 53 453.207547 39.494575 447.0 450.441861 34.09980 385.0 576.0 191.0 0.7393910 0.3554250 5.4249971
SAL_ALL…40 private 40 53 612.433962 90.184748 601.0 605.162791 77.09520 476.0 866.0 390.0 0.7542403 0.0361182 12.3878279
COMP_FUL…41 private 41 53 956.905660 120.627409 952.0 952.976744 114.16020 718.0 1236.0 518.0 0.3668762 -0.5426861 16.5694489
COMP_AC…42 private 42 53 682.584906 67.603890 682.0 677.813954 53.37360 533.0 909.0 376.0 0.7806619 1.3435425 9.2861084
COMP_AS…43 private 43 53 570.679245 57.059272 558.0 566.860465 56.33880 459.0 717.0 258.0 0.6114793 -0.0643986 7.8376938
COMP_ALL…44 private 44 53 766.094340 111.838184 744.0 759.511628 85.99080 570.0 1075.0 505.0 0.6055614 -0.0733207 15.3621560
NUM_FULL…45 private 45 53 275.792453 148.749061 236.0 261.790698 123.05580 55.0 654.0 599.0 0.8015128 -0.4075732 20.4322548
NUM_AC…46 private 46 52 168.192308 77.335287 149.0 160.404762 54.85620 40.0 382.0 342.0 0.9090695 0.2944050 10.7244747
NUM_AS…47 private 47 53 144.622641 57.649547 141.0 141.627907 53.37360 40.0 304.0 264.0 0.5337624 0.2576431 7.9187743
NUM_INS…48 private 48 53 16.150943 21.237119 8.0 12.116279 8.89560 0.0 111.0 111.0 2.2060240 5.7995390 2.9171427
NUM_ALL…49 private 49 53 625.716981 268.710623 556.0 602.093023 257.97240 161.0 1372.0 1211.0 0.7590003 -0.0133067 36.9102428
…50 public 1 123 92.934959 38.971982 97.0 95.636364 42.99540 2.0 152.0 150.0 -0.5325816 -0.6691725 3.5139853
PPIND…51 public 2 123 2.000000 0.000000 2.0 2.000000 0.00000 2.0 2.0 0.0 NaN NaN 0.0000000
FICE…52 public 3 123 3088.024390 2078.902916 2568.0 2648.797980 1395.12660 1009.0 10366.0 9357.0 2.0797258 4.1720683 187.4483633
STATE…53 public 4 123 26.577236 14.568666 27.0 26.868687 17.79120 1.0 50.0 49.0 -0.1483948 -1.2889923 1.3136124
TYPE…54 public 5 123 1.000000 0.000000 1.0 1.000000 0.00000 1.0 1.0 0.0 NaN NaN 0.0000000
AVRMATH…55 public 6 80 541.087500 49.213869 535.5 539.687500 46.70190 390.0 655.0 265.0 0.0872105 0.3668895 5.5022778
AVRVERB…56 public 7 80 473.675000 37.327756 465.0 471.078125 31.87590 391.0 600.0 209.0 0.7516249 0.7460887 4.1733700
AVRCOMB…57 public 8 80 1014.662500 84.658651 997.0 1010.687500 75.61260 810.0 1240.0 430.0 0.4180047 0.1997793 9.4651249
AVR_ACT…58 public 9 74 22.878378 1.766916 23.0 22.783333 1.48260 19.0 27.0 8.0 0.4465584 0.2074695 0.2053998
MATH_1…59 public 10 91 483.109890 56.137025 470.0 481.561644 59.30400 350.0 630.0 280.0 0.2120495 -0.3354537 5.8847592
MATH_3…60 public 11 91 609.582418 52.348844 610.0 608.794520 44.47800 480.0 730.0 250.0 0.0468177 -0.2326174 5.4876499
VERB_1…61 public 12 91 418.648352 41.648362 412.0 417.095890 40.03020 320.0 540.0 220.0 0.4096931 0.1814679 4.3659347
VERB_3…62 public 13 91 533.571429 42.218254 530.0 531.397260 44.47800 440.0 650.0 210.0 0.4561332 0.1613254 4.4256755
ACT_1…63 public 14 78 20.243590 1.948598 20.0 20.203125 1.48260 16.0 25.0 9.0 0.2533577 -0.2129912 0.2206353
ACT_3…64 public 15 78 25.589744 1.882496 25.5 25.562500 2.22390 21.0 31.0 10.0 0.1906946 0.4699948 0.2131506
APP_REC…65 public 16 122 8770.885246 6213.519109 7628.0 8115.673469 5043.80520 787.0 48094.0 47307.0 2.3449331 11.2548093 562.5455881
APP_ACC…66 public 17 122 6191.950820 3936.148165 5468.5 5770.632653 3498.19470 601.0 26330.0 25729.0 1.5943959 4.6898831 356.3621107
NEW_STUD…67 public 18 123 2594.447155 1321.726536 2408.0 2459.949495 1143.08460 233.0 7425.0 7192.0 1.0361383 1.3969684 119.1760683
NEW10…68 public 19 108 33.620370 21.866937 26.0 29.920455 11.86080 8.0 95.0 87.0 1.5420644 1.5799521 2.1041470
NEW25…69 public 20 101 58.504950 17.283879 57.0 57.691358 13.34340 24.0 99.0 75.0 0.4213560 -0.4515245 1.7198102
FULLTIME…70 public 21 123 13448.065041 6196.503540 13436.0 12986.424242 5325.49920 1017.0 31643.0 30626.0 0.6892755 0.4241041 558.7199084
PARTTIME…71 public 22 123 3108.975610 2807.979142 2411.0 2638.545454 1632.34260 114.0 21836.0 21722.0 3.0574510 14.9944532 253.1869527
IN_STATE…72 public 23 116 2537.137931 1078.590561 2408.0 2479.372340 877.69920 647.0 6210.0 5563.0 0.6645711 0.6461950 100.1446195
OUT_STAT…73 public 24 122 7657.549180 2252.312472 7446.5 7517.163265 1939.24080 2279.0 15732.0 13453.0 0.7771996 1.2996173 203.9147900
R_B_COST…74 public 25 123 3977.073171 842.490474 3811.0 3911.979798 722.02620 2082.0 6607.0 4525.0 0.7886999 0.7902446 75.9648078
ROOM…75 public 26 89 2397.505618 975.041854 1960.0 2284.013699 607.86600 1033.0 6081.0 5048.0 1.1743603 1.1457148 103.3542298
BOARD…76 public 27 69 1905.565217 434.954037 1900.0 1884.087719 385.47600 1000.0 3271.0 2271.0 0.6514170 1.0601888 52.3623128
ADD_FEE…77 public 28 90 748.388889 1015.494917 443.0 495.569444 404.00850 20.0 4374.0 4354.0 2.6479006 6.0998551 107.0425630
BOOK…78 public 29 121 600.008264 101.162122 600.0 591.103093 111.19500 400.0 858.0 458.0 0.6447669 -0.2889320 9.1965565
PERSONAL…79 public 30 113 1886.495575 761.545927 1851.0 1849.109890 816.91260 500.0 4288.0 3788.0 0.4382313 -0.1747922 71.6402146
PH_D…80 public 31 118 84.110169 7.798514 85.0 84.406250 7.41300 63.0 99.0 36.0 -0.3698537 -0.3490528 0.7179114
TERM_D…81 public 32 112 89.285714 6.765611 90.0 89.988889 5.93040 67.0 99.0 32.0 -0.9104174 0.6304258 0.6392901
SF_RATIO…82 public 33 123 16.129268 3.830202 16.5 16.178788 4.15128 6.7 24.7 18.0 -0.0994206 -0.5699091 0.3453577
DONATE…83 public 34 111 15.180180 7.924873 14.0 14.269663 7.41300 4.0 48.0 44.0 1.1950772 1.9153609 0.7521956
INSTRUCT…84 public 35 123 9228.512195 3136.136938 8612.0 8884.141414 2828.80080 3605.0 20260.0 16655.0 0.9838824 0.7050905 282.7759447
GRADUAT…85 public 36 119 54.731092 15.573063 54.0 54.412371 16.30860 10.0 95.0 85.0 0.1127780 -0.1892703 1.4275804
SAL_FULL…86 public 37 123 625.520325 76.323873 615.0 625.070707 77.09520 446.0 846.0 400.0 0.1043039 -0.3571508 6.8818919
SAL_AC…87 public 38 123 464.593496 43.609827 460.0 462.858586 40.03020 364.0 611.0 247.0 0.4857462 0.5123490 3.9321657
SAL_AS…88 public 39 123 395.357724 30.597099 394.0 394.404040 29.65200 323.0 478.0 155.0 0.2511988 -0.1399305 2.7588475
SAL_ALL…89 public 40 123 500.186992 61.477218 495.0 499.353535 65.23440 362.0 665.0 303.0 0.1876957 -0.4858715 5.5432141
COMP_FUL…90 public 41 123 771.333333 98.575565 762.0 770.383838 109.71240 537.0 1045.0 508.0 0.0888023 -0.4000660 8.8882594
COMP_AC…91 public 42 123 581.682927 60.762143 573.0 578.393939 62.26920 438.0 780.0 342.0 0.5334294 0.5391767 5.4787379
COMP_AS…92 public 43 123 496.707317 43.222590 494.0 495.515152 41.51280 395.0 624.0 229.0 0.2751813 -0.0158559 3.8972497
COMP_ALL…93 public 44 123 622.560976 79.774595 616.0 621.909091 90.43860 436.0 838.0 402.0 0.1262098 -0.5008610 7.1930330
NUM_FULL…94 public 45 123 363.113821 223.230462 297.0 335.262626 183.84240 39.0 997.0 958.0 1.0456825 0.4760854 20.1280130
NUM_AC…95 public 46 123 256.536585 125.159328 231.0 246.010101 111.19500 32.0 721.0 689.0 0.9641768 1.2571277 11.2852365
NUM_AS…96 public 47 123 210.528455 94.538077 195.0 202.020202 84.50820 29.0 510.0 481.0 0.8391736 0.6148800 8.5242113
NUM_INS…97 public 48 122 31.844262 34.957831 20.0 25.775510 26.68680 0.0 178.0 178.0 1.7910083 3.7266901 3.1649333
NUM_ALL…98 public 49 123 893.382114 440.880797 793.0 848.363636 401.78460 108.0 2261.0 2153.0 0.8715691 0.3147285 39.7528827
df_long <- col_I_sn[c("PPIND", q_cols)] |>
  pivot_longer(cols = -PPIND, names_to = "attribute", values_to = "value")
df_long|>rmarkdown::paged_table()
ggplot(df_long, aes(x = attribute, y = value, color = PPIND)) +
  geom_boxplot() +
  labs(title = "Boxplots of Attributes by PPIND", x = "Attribute", y = "Value") +
  theme_bw() +
  theme(axis.text.x = element_text(size=3.5, angle = 45, hjust = 1, vjust=1)) +
    facet_wrap(~PPIND, scales="free") +
  theme(legend.position = "no")
## Warning: Removed 817 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggplot(df_long, aes(x = attribute, y = value, color = PPIND)) +
  geom_boxplot() +
  labs(title = "Boxplots of Attributes by PPIND", x = "Attribute", y = "Value") +
  theme_bw() +
  theme(axis.text.x = element_text(size=3.5, angle = 45, hjust = 1, vjust=1))
## Warning: Removed 817 rows containing non-finite outside the scale range
## (`stat_boxplot()`).